1 Objective

The objective of this notebook is to create a Seurat object, assess the quality of the data and filter out the low-quality cells. We followed the procedure described by a specific Signac’s vignette entitled “Joint RNA and ATAC analysis: 10x multiomic”.

2 Pre-processing

2.1 Load packages

library(Signac)
library(Seurat)
library(ggplot2)
library(ggpubr)
library(tidyverse)
library(plyr)
library(reshape2)
library(data.table)
library(GenomicRanges)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(hdf5r)
library(stringr)
set.seed(173)

2.2 Parameters

# Thresholds
TSS_enrichment <- 2 
nucleosome_signal_atac <- 2
min_lib_size_atac<- 500
max_lib_size_atac <- 100000

min_lib_size_rna <- 550
max_lib_size_rna <- 40000
min_n_genes <- 250
max_n_genes <- 6000
max_pct_mt <- 20

2.3 Parameters

# Paths
path_to_data_exp1 <- here::here("multiome/results/Experiment_1/")
path_to_data_exp2 <- here::here("multiome/results/Experiment_2/")
path_to_save <- here::here("multiome/results/R_objects/")

2.4 Functions

plot_histogram_qc <- function(df, x, x_lab) {
  df %>%
    ggplot(aes_string(x)) +
    geom_histogram(bins = 100) +
    labs(x = x_lab, y = "Number of Cells") +
    theme_pubr()
}

2.5 Gene annotation

Extraction of gene annotations from EnsDb using hg38 as the reference assembly.

annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"

3 Load data

Signac uses information from three related input files (created using CellRanger ARC):

  1. Count matrix in h5 format
  2. ATAC Fragment file
  3. ATAC Fragment file index
read_atac_data <- function(url_project,library_name) {
  counts <- Seurat::Read10X_h5(filename = paste0(url_project, "filtered_feature_bc_matrix.h5"))
  
  # create a Seurat object containing the scRNA adata
  tonsil <- Seurat::CreateSeuratObject(
  counts = counts$`Gene Expression`,
  assay = "RNA")

  # create a Seurat object containing the scATAC adata
  tonsil[["ATAC"]] <- Signac::CreateChromatinAssay(
    counts = counts$Peaks,
    sep = c(":", "-"),
    genome = "hg38",
    fragments = paste0(url_project, "atac_fragments.tsv.gz"),
    annotation = annotation)
  
  tonsil$library_name = library_name
  return(tonsil)
}

3.1 Read samples from Experiment 1 and assign them to a proper library name

BCLL_9_T_1 <- read_atac_data(paste0(path_to_data_exp1,"co7dzuup_xuczw9vc/"),"BCLL_9_T_1")
BCLL_9_T_2 <- read_atac_data(paste0(path_to_data_exp1,"qmzb59ew_t11l8dzm/"),"BCLL_9_T_2")
BCLL_8_T_1 <- read_atac_data(paste0(path_to_data_exp1,"ulx1v6sz_8a2nvf1c/"),"BCLL_8_T_1")
BCLL_8_T_2 <- read_atac_data(paste0(path_to_data_exp1,"wdp0p728_jf6w68km/"),"BCLL_8_T_2")

3.2 Read samples from Experiment 2 and assign them to a proper library name

BCLL_14_T_1 <- read_atac_data(paste0(path_to_data_exp2,"pd9avu0k_kf9ft6kk/"),"BCLL_14_T_1")
BCLL_14_T_2 <- read_atac_data(paste0(path_to_data_exp2,"vuuqir4h_wfkyb5v8/"),"BCLL_14_T_2")
BCLL_15_T_1 <- read_atac_data(paste0(path_to_data_exp2,"admae8w2_89i88tvv/"),"BCLL_15_T_1")
BCLL_15_T_2 <- read_atac_data(paste0(path_to_data_exp2,"sr20954q_yiuuoxng/"),"BCLL_15_T_2")
BCLL_2_T_1 <- read_atac_data(paste0(path_to_data_exp2,"kmbfo1ab_ie02b4ny/"),"BCLL_2_T_1")
BCLL_2_T_2 <- read_atac_data(paste0(path_to_data_exp2,"ryh4el3i_biv0w7ca/"),"BCLL_2_T_2")
BCLL_2_T_3 <- read_atac_data(paste0(path_to_data_exp2,"bs2e7lr7_mdfwypvz/"),"BCLL_2_T_3")
libraries <- c(BCLL_9_T_1,BCLL_9_T_2,BCLL_8_T_1,BCLL_8_T_2,
               BCLL_14_T_1,BCLL_14_T_2,BCLL_15_T_1,BCLL_15_T_2,
               BCLL_2_T_1,BCLL_2_T_2,BCLL_2_T_3)

4 Quality control

We can compute per-cell quality metrics taking in account three specifics from DNA accessibility such as TSS.enrichment, nucleosome signal and nCount_ATAC and nCount_RNA from gene expression data.

quality_control <- function(seurat_object) {
  
  DefaultAssay(seurat_object) <- "ATAC"
  seurat_object <- NucleosomeSignal(object = seurat_object)
  seurat_object <- TSSEnrichment(object = seurat_object, fast = FALSE)
  seurat_object$high.tss <- ifelse(seurat_object$TSS.enrichment > 2, "High", "Low")
  
  DefaultAssay(seurat_object) <- "RNA"
  seurat_object[["pct_mt"]] <- PercentageFeatureSet(seurat_object, pattern = "^MT-")
  seurat_object[["pct_ribosomal"]] <- PercentageFeatureSet(seurat_object, pattern = "^RPS")
  
  return(seurat_object)
}
all_data = lapply(libraries, quality_control)
metadata_QC = data.frame()

for (dat in all_data)
  {
    metadata_QC = rbind(metadata_QC,dat@meta.data)
  }

saveRDS(all_data,paste0(path_to_save,"1.tonsil_multiome_QC.rds"))

4.1 scATAC parameters

4.1.1 Number of detected peaks

np <- ggviolin(metadata_QC,
  x = "library_name", fill = "gray70", x.text.angle = 45,
  y = "nFeature_ATAC") + scale_y_log10()
np

print(paste("The median of total number of peaks is:", median(metadata_QC$nFeature_ATAC)))
## [1] "The median of total number of peaks is: 3331"
print("The summary of total number of peaks per library")
## [1] "The summary of total number of peaks per library"
aggregate(nFeature_ATAC ~ library_name, data = metadata_QC, median)
##    library_name nFeature_ATAC
## 1   BCLL_14_T_1        5009.0
## 2   BCLL_14_T_2        5001.0
## 3   BCLL_15_T_1        4264.0
## 4   BCLL_15_T_2        4160.0
## 5    BCLL_2_T_1        1611.0
## 6    BCLL_2_T_2        1598.5
## 7    BCLL_2_T_3        2223.0
## 8    BCLL_8_T_1        4656.0
## 9    BCLL_8_T_2        4355.0
## 10   BCLL_9_T_1        5552.0
## 11   BCLL_9_T_2        5461.5

4.1.2 Nucleosome banding pattern

Quantification of the ratio of mononucleosomal (fragment lengths between 147 and 294bp) to nucleosome-free fragments(less than 147).The red dashed line represent the upper threshold applied.

nbp <- function(seurat_object) {
  DefaultAssay(seurat_object) <- "ATAC"
  f <- FragmentHistogram(object = seurat_object)

  NFR <- length(which(f$data$length < 147)) / nrow(f$data) * 100
  MONO <- length(which(f$data$length > 147 & f$data$length < 294)) / nrow(f$data) * 100
  DI <- length(which(f$data$length > 294)) / nrow(f$data) * 100

  min.threshold <- 147
  max.threshold <- 294

  # options(repr.plot.width=7, repr.plot.height=2)
  options(repr.plot.width = 17, repr.plot.height = 8)
  p <- ggplot(f$data, aes(length)) + ggtitle(unique(seurat_object$library_name)) +
    geom_histogram(binwidth = 1, alpha = .5, fill = "blue") +
    geom_density(aes(y = ..count..), bw = 1, alpha = 0, col = "black", lwd = 1) +
    scale_x_continuous(limits = c(0, 500)) +
    geom_vline(xintercept = c(min.threshold, max.threshold)) +
    theme_minimal() +
    geom_vline(xintercept = 39, color = "red") +
    geom_text(x = 80, y = 20, label = round(NFR, 2), size = 8) +
    geom_text(x = 200, y = 20, label = round(MONO, 2), size = 8) +
    geom_text(x = 350, y = 20, label = round(DI, 2), size = 8)
  print(p)
}
plot_nbp = lapply(all_data, nbp)

plot_nbp
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

ns <- ggviolin(metadata_QC,
  x = "library_name", fill = "gray70", x.text.angle = 45,
  y = "nucleosome_signal") + scale_y_log10() + geom_hline(yintercept = nucleosome_signal_atac, linetype='dashed', col = 'black')
ns

4.1.3 Transcriptional start site (TSS) enrichment score

Here, we can see the plot of the normalized TSS enrichment score at each position relative to the TSS. The red dashed line represent the upper threshold applied.

plot_tss <- function(seurat_object){
 DefaultAssay(seurat_object) <- "ATAC" 
 Signac::TSSPlot(seurat_object, group.by = "high.tss") + 
   ggtitle(paste("TSS enrichment score", unique(seurat_object$library_name)))
}

plots_tss_tonsil = lapply(all_data, plot_tss)
plots_tss_tonsil
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

tss <- ggviolin(metadata_QC,
  x = "library_name", fill = "gray70", x.text.angle = 45,
  y = "TSS.enrichment") + geom_hline(yintercept = (2), linetype='dashed', col = 'black') + scale_y_log10()
tss

4.1.4 Library size

The red dashed lines represent the lower and the upper thresholds applied.

ns <- ggviolin(metadata_QC,
  x = "library_name", fill = "gray70", x.text.angle = 45,
  y = "nCount_ATAC") +  scale_y_log10() + 
  geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'black')
ns

lib_size_hist <- metadata_QC  %>%
  plot_histogram_qc(x = "nCount_ATAC", x_lab = "Library Size (log10)") +
  geom_vline(xintercept = min_lib_size_atac, linetype = "dashed", color = "black")
lib_size_hist1 <- lib_size_hist +
    scale_x_log10() +
    geom_vline(xintercept = max_lib_size_atac, linetype = "dashed", color = "black")
lib_size_hist2 <- lib_size_hist +
    scale_x_continuous(limits = c(0, 5000)) +
    xlab("Library Size") +
    theme_pubr()
lib_size_hist1 + lib_size_hist2

4.2 scRNA parameters

We aim to detect and exclude empty droplets or lysed cells. Lysed cells have 3 hallmarks: (1) low library size (total UMI), (2) low library complexity (number of detected genes) and (3) high fraction of mitochondrial expression (cytosolic mRNA leaks out of the cell). Let us start by visualizing their univariate distributions.

4.2.1 Library Size

The red dashed lines represent the lower and the upper thresholds applied.

nc <- ggviolin(metadata_QC,
  x = "library_name", fill = "library_name", x.text.angle = 45,
  y = "nCount_RNA") +  scale_y_log10() + 
  geom_hline(yintercept = c(min_lib_size_rna,max_lib_size_rna), linetype='dashed', col = 'black')
nc

lib_size_hist <- metadata_QC  %>%
  plot_histogram_qc(x = "nCount_RNA", x_lab = "Library Size (log10(total UMI))") +
  geom_vline(xintercept = min_lib_size_rna, linetype = "dashed", color = "black")
lib_size_hist1 <- lib_size_hist +
    scale_x_log10() +
    geom_vline(xintercept = max_lib_size_rna, linetype = "dashed", color = "black")
lib_size_hist2 <- lib_size_hist +
    scale_x_continuous(limits = c(0, 2000)) +
    xlab("Library Size (total UMI)") +
    theme_pubr()
lib_size_hist1 + lib_size_hist2

4.2.2 Number of detected genes

The red dashed lines represent the lower and upper thresholds applied. However, cells with a value higher than the upper threshold will be kept for the downstream analysis.

metadata_QC$has_high_lib_size <- 
  metadata_QC$nCount_RNA > max_lib_size_rna |
  metadata_QC$nFeature_RNA > max_n_genes
ns <- ggviolin(metadata_QC,
  x = "library_name", fill = "gray70", x.text.angle = 45,
  y = "nFeature_RNA") +  scale_y_log10() + 
  geom_hline(yintercept = c(min_n_genes,max_n_genes), linetype='dashed', col = 'black')
ns

n_genes_hist1 <- metadata_QC %>%
  plot_histogram_qc(x = "nFeature_RNA", x_lab = "Number of Detected Genes") +
  geom_vline(xintercept = min_n_genes, linetype = "dashed", color = "black") +
  geom_vline(xintercept = max_n_genes, linetype = "dashed", color = "black")
n_genes_hist2 <- n_genes_hist1 +
  scale_x_continuous(limits = c(0, 2000)) 
n_genes_hist1 + n_genes_hist2

print(paste("The median of total number of genes is:", median(metadata_QC$nFeature_RNA)))
## [1] "The median of total number of genes is: 1764"
print("The summary of total number of genes per library")
## [1] "The summary of total number of genes per library"
aggregate(nFeature_RNA ~ library_name, data = metadata_QC, median)
##    library_name nFeature_RNA
## 1   BCLL_14_T_1         1736
## 2   BCLL_14_T_2         1659
## 3   BCLL_15_T_1         1713
## 4   BCLL_15_T_2         1895
## 5    BCLL_2_T_1         1291
## 6    BCLL_2_T_2         1584
## 7    BCLL_2_T_3         1586
## 8    BCLL_8_T_1         2318
## 9    BCLL_8_T_2         2117
## 10   BCLL_9_T_1         1997
## 11   BCLL_9_T_2         2142

4.2.3 Library Size vs library complexity

ggscatter(metadata_QC, x = "nCount_RNA", y = "nFeature_RNA",
  color = "library_name") 

4.2.4 Fraction of mitochondrial expression

pct_mt_hist <- metadata_QC %>%
  plot_histogram_qc(x = "pct_mt", x_lab = "% Mitochondrial Expression") +
  geom_vline(xintercept = max_pct_mt, linetype = "dashed", color = "black") +
  scale_x_continuous(limits = c(0, 100))

pct_mt_hist

4.2.5 Fraction of ribosomal expression

Let us also calculate and visualize the proportion of ribosomal expression:

metadata_QC %>% plot_histogram_qc(x = "pct_ribosomal", x_lab = "% Ribosomal Expression")

5 Filtering

Finally we remove cells that are outliers for these QC metrics.

filtering_QC <- function(seurat_object) {
  seurat_object <- subset(
    x = seurat_object, 
    nCount_ATAC < max_lib_size_atac &
    pct_mt < max_pct_mt &
    nCount_ATAC > min_lib_size_atac &
    nCount_RNA > min_lib_size_rna &
    nFeature_RNA > min_n_genes &
    nucleosome_signal < nucleosome_signal_atac &
    TSS.enrichment > TSS_enrichment)
  
  return(seurat_object)
}
tonsil_data_filtered = lapply(all_data, filtering_QC)
saveRDS(tonsil_data_filtered,paste0(path_to_save,"2.tonsil_multiome_filtered.rds"))

print(paste("Number of total filtered cells:", sum(melt(lapply(tonsil_data_filtered, ncol))$value)))
## [1] "Number of total filtered cells: 69118"

6 Session info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Tonsil_atlas/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] hdf5r_1.3.3               EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.12.1          AnnotationFilter_1.12.0   GenomicFeatures_1.40.1    AnnotationDbi_1.50.3      Biobase_2.48.0            GenomicRanges_1.40.0      GenomeInfoDb_1.24.0       IRanges_2.22.1            S4Vectors_0.26.0          BiocGenerics_0.34.0       data.table_1.13.2         reshape2_1.4.4            plyr_1.8.6                forcats_0.5.0             stringr_1.4.0             dplyr_1.0.2               purrr_0.3.4               readr_1.4.0               tidyr_1.1.2               tibble_3.0.4              tidyverse_1.3.0           ggpubr_0.4.0              ggplot2_3.3.2             Seurat_3.9.9.9010         Signac_1.1.0.9000         BiocStyle_2.16.1         
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.1              SnowballC_0.7.0             rtracklayer_1.48.0          GGally_2.0.0                bit64_4.0.5                 knitr_1.30                  irlba_2.3.3                 DelayedArray_0.14.0         rpart_4.1-15                RCurl_1.98-1.2              generics_0.1.0              cowplot_1.1.0               RSQLite_2.2.1               RANN_2.6.1                  future_1.20.1               bit_4.0.4                   spatstat.data_1.4-3         xml2_1.3.2                  lubridate_1.7.9             httpuv_1.5.4                SummarizedExperiment_1.18.1 assertthat_0.2.1            xfun_0.18                   hms_0.5.3                   evaluate_0.14               promises_1.1.1              fansi_0.4.1                 progress_1.2.2              dbplyr_1.4.4                readxl_1.3.1                igraph_1.2.6                DBI_1.1.0                   htmlwidgets_1.5.2           reshape_0.8.8               ellipsis_0.3.1              backports_1.2.0             bookdown_0.21               biomaRt_2.44.4              deldir_0.2-3                vctrs_0.3.4                 here_1.0.1                  ROCR_1.0-11                
##  [43] abind_1.4-5                 withr_2.3.0                 ggforce_0.3.2               BSgenome_1.56.0             checkmate_2.0.0             sctransform_0.3.1           GenomicAlignments_1.24.0    prettyunits_1.1.1           goftest_1.2-2               cluster_2.1.0               lazyeval_0.2.2              crayon_1.3.4                labeling_0.4.2              pkgconfig_2.0.3             tweenr_1.0.1                nlme_3.1-150                ProtGenerics_1.20.0         nnet_7.3-14                 rlang_0.4.8                 globals_0.13.1              lifecycle_0.2.0             miniUI_0.1.1.1              BiocFileCache_1.12.1        modelr_0.1.8                rsvd_1.0.3                  dichromat_2.0-0             rprojroot_2.0.2             cellranger_1.1.0            polyclip_1.10-0             matrixStats_0.57.0          lmtest_0.9-38               graph_1.66.0                Matrix_1.2-18               ggseqlogo_0.1               carData_3.0-4               zoo_1.8-8                   reprex_0.3.0                base64enc_0.1-3             ggridges_0.5.2              png_0.1-7                   viridisLite_0.3.0           bitops_1.0-6               
##  [85] KernSmooth_2.23-17          Biostrings_2.56.0           blob_1.2.1                  parallelly_1.21.0           jpeg_0.1-8.1                rstatix_0.6.0               ggsignif_0.6.0              scales_1.1.1                memoise_1.1.0               magrittr_1.5                ica_1.0-2                   zlibbioc_1.34.0             compiler_4.0.3              RColorBrewer_1.1-2          fitdistrplus_1.1-1          Rsamtools_2.4.0             cli_2.1.0                   XVector_0.28.0              listenv_0.8.0               patchwork_1.1.0             pbapply_1.4-3               htmlTable_2.1.0             Formula_1.2-4               MASS_7.3-53                 mgcv_1.8-33                 tidyselect_1.1.0            stringi_1.5.3               yaml_2.2.1                  askpass_1.1                 latticeExtra_0.6-29         ggrepel_0.8.2               grid_4.0.3                  VariantAnnotation_1.34.0    fastmatch_1.1-0             tools_4.0.3                 future.apply_1.6.0          rio_0.5.16                  rstudioapi_0.12             foreign_0.8-80              lsa_0.73.2                  gridExtra_2.3               farver_2.0.3               
## [127] Rtsne_0.15                  digest_0.6.27               BiocManager_1.30.10         shiny_1.5.0                 Rcpp_1.0.5                  car_3.0-10                  broom_0.7.2                 later_1.1.0.1               RcppAnnoy_0.0.16            OrganismDbi_1.30.0          httr_1.4.2                  ggbio_1.36.0                biovizBase_1.36.0           colorspace_2.0-0            rvest_0.3.6                 XML_3.99-0.3                fs_1.5.0                    tensor_1.5                  reticulate_1.18             splines_4.0.3               uwot_0.1.8.9001             RBGL_1.64.0                 RcppRoll_0.3.0              spatstat.utils_1.17-0       plotly_4.9.2.1              xtable_1.8-4                jsonlite_1.7.1              spatstat_1.64-1             R6_2.5.0                    Hmisc_4.4-1                 pillar_1.4.6                htmltools_0.5.0             mime_0.9                    glue_1.4.2                  fastmap_1.0.1               BiocParallel_1.22.0         codetools_0.2-17            lattice_0.20-41             curl_4.3                    leiden_0.3.5                zip_2.1.1                   openxlsx_4.2.3             
## [169] openssl_1.4.3               survival_3.2-7              rmarkdown_2.5               munsell_0.5.0               GenomeInfoDbData_1.2.3      haven_2.3.1                 gtable_0.3.0